import data

bcdata = read_csv("bcdata_Assignment1.csv") |> janitor::clean_names()

Q1 get quantitative fesures

data_type = str(bcdata) # all variables are numeric.
## spc_tbl_ [116 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age           : num [1:116] 48 83 82 68 86 49 89 76 73 75 ...
##  $ bmi           : num [1:116] 23.5 20.7 23.1 21.4 21.1 ...
##  $ glucose       : num [1:116] 70 92 91 77 92 92 77 118 97 83 ...
##  $ insulin       : num [1:116] 2.71 3.12 4.5 3.23 3.55 ...
##  $ homa          : num [1:116] 0.467 0.707 1.01 0.613 0.805 ...
##  $ leptin        : num [1:116] 8.81 8.84 17.94 9.88 6.7 ...
##  $ adiponectin   : num [1:116] 9.7 5.43 22.43 7.17 4.82 ...
##  $ resistin      : num [1:116] 8 4.06 9.28 12.77 10.58 ...
##  $ mcp_1         : num [1:116] 417 469 555 928 774 ...
##  $ classification: num [1:116] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   BMI = col_double(),
##   ..   Glucose = col_double(),
##   ..   Insulin = col_double(),
##   ..   HOMA = col_double(),
##   ..   Leptin = col_double(),
##   ..   Adiponectin = col_double(),
##   ..   Resistin = col_double(),
##   ..   MCP.1 = col_double(),
##   ..   Classification = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary = skimr::skim(bcdata)# There's no missing in this dataset.
summary
Data summary
Name bcdata
Number of rows 116
Number of columns 10
_______________________
Column type frequency:
numeric 10
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 57.30 16.11 24.00 45.00 56.00 71.00 89.00 ▃▇▅▇▃
bmi 0 1 27.58 5.02 18.37 22.97 27.66 31.24 38.58 ▅▆▇▅▃
glucose 0 1 97.79 22.53 60.00 85.75 92.00 102.00 201.00 ▅▇▁▁▁
insulin 0 1 10.01 10.07 2.43 4.36 5.92 11.19 58.46 ▇▁▁▁▁
homa 0 1 2.69 3.64 0.47 0.92 1.38 2.86 25.05 ▇▁▁▁▁
leptin 0 1 26.62 19.18 4.31 12.31 20.27 37.38 90.28 ▇▃▂▁▁
adiponectin 0 1 10.18 6.84 1.66 5.47 8.35 11.82 38.04 ▇▅▂▁▁
resistin 0 1 14.73 12.39 3.21 6.88 10.83 17.76 82.10 ▇▂▁▁▁
mcp_1 0 1 534.65 345.91 45.84 269.98 471.32 700.08 1698.44 ▇▇▃▁▁
classification 0 1 1.55 0.50 1.00 1.00 2.00 2.00 2.00 ▆▁▁▁▇

Q2 catogorize bmi based on WHO criteria

bcdata = bcdata |> mutate(who_bmi =
                   ifelse(bmi < 16.5, "Severely underweight",
                          ifelse(16.5 <= bmi & bmi < 18.5, "Underweight",
                                 ifelse(18.5 <= bmi & bmi < 25, "Normal weight",
                                        ifelse(25 <= bmi & bmi < 30, "Overweight", 
                                               ifelse(30 <= bmi & bmi < 35, "Obesity class I",
                                                      ifelse(35 <= bmi & bmi < 40, "Obesity class II","Obesity class III"))))))) |> 
  mutate(classification = recode(classification, "1" = "Healthy Controls", "2" = "Breast Cancer Patients")) |> arrange(bmi)



#check if I have recoded bmi correctly
table(bcdata$bmi, bcdata$who_bmi)

Q3 draw bar plot

#Since there's no healthy controls in underweight category, I added a category "underweight healthhy controls" with 0 count to make sure every column has the same width.
freq_table = bcdata |> group_by(who_bmi, classification) |>
  summarise(n = n()) |> mutate(proportion = n / sum(n) * 100)

supp = data.frame(who_bmi = "Underweight", 
              classification = "Healthy Controls",
              n = 0, proportion = 0)
final_freq = bind_rows(supp, freq_table) |> mutate(who_bmi = as.factor(who_bmi)) 

level = c("Severely underweight", "Underweight", "Normal weight", "Overweight", "Obesity class I", "Obesity class II", "Obesity class III")
final_freq |> 
  mutate(who_bmi = forcats::fct_relevel(who_bmi, level),
         text_label = str_c(proportion, "%")) |>
  plot_ly(x = ~who_bmi, y = ~proportion, color = ~classification, type = "bar", colors = "viridis", text = ~text_label)

But honestly, I think a barchart showing porportion of the whole sample within each category.(each category doesn’t accounted for 1) is more informative.